import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): blanket warning suppression hides deprecation notices — keep
# only while the notebook output needs to stay clean.
warnings.filterwarnings('ignore')
# Load the bank marketing dataset and take a first look.
data = pd.read_csv('bank-full.csv')
data.shape
data.head(20)
data.columns
# Cardinality of the columns of interest.
# Fix: 'previous' was listed twice in the original selection.
data[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
      'contact', 'campaign', 'previous', 'poutcome', 'day', 'duration',
      'pdays', 'Target']].nunique()
data.describe().transpose()
data.dtypes
# 'day' is the day-of-month of the last contact; treat it as text, not a number.
data['day'] = data['day'].apply(str)
data.dtypes
data.isna().sum()
# Collect the integer-typed (continuous) columns and plot each distribution.
# Fix: sns.distplot is deprecated (removed in seaborn 0.14); histplot with a
# KDE overlay is the supported equivalent.
cont_cols = []
for col in data.columns:
    if data[col].dtype == 'int64':
        cont_cols.append(col)
        plt.figure(figsize=(16, 8))
        sns.histplot(data[col], kde=True)
        plt.show()
# Box plots of every continuous column to eyeball outliers.
# Fix: pass the data by keyword — positional use of the first argument is
# deprecated in recent seaborn releases.
for col in data.columns:
    if data[col].dtype == 'int64':
        plt.figure(figsize=(16, 8))
        sns.boxplot(x=data[col])
        plt.show()
# Look specifically at clients whose yearly balance is negative.
neg_balance = data.loc[data['balance'] < 0]
neg_balance
plt.figure(figsize=(16, 8))
sns.boxplot(neg_balance['balance'])
def get_outlier_stats(data, col, rowmask, stats):
    """Compute IQR-based outlier statistics for one numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to analyse.
    col : str
        Name of a numeric column in ``data``.
    rowmask : list[bool]
        Per-row flags, updated in place: an entry is set to True when the
        corresponding row is an outlier in ``col``. Already-True entries
        are never cleared, so the mask accumulates across columns.
    stats : dict
        Updated in place: ``stats[col]`` receives
        ``[q25, q75, IQR, lower fence, upper fence, outlier count, outlier %]``.

    The fences are the Tukey fences (q25 - 1.5*IQR, q75 + 1.5*IQR), clipped
    to the observed min/max of the column.
    """
    values = data[col].values
    q25, q75 = np.percentile(values, 25), np.percentile(values, 75)
    iqr = q75 - q25
    # Clip the fences to the data range. (The original bound the column
    # min/max to names that shadowed the builtins ``min``/``max``.)
    lower = max(q25 - iqr * 1.5, np.min(values))
    upper = min(q75 + iqr * 1.5, np.max(values))
    # One vectorized pass replaces the original duplicated scans
    # (a comprehension plus an index loop over the same array).
    is_outlier = (values < lower) | (values > upper)
    for i in np.nonzero(is_outlier)[0]:
        rowmask[i] = True
    n_outliers = int(np.count_nonzero(is_outlier))
    stats[col] = [q25, q75, iqr, lower, upper, n_outliers,
                  round(n_outliers * 100.0 / len(values), 2)]
# Accumulate IQR outlier statistics over every continuous column and
# summarize them in one table.
rowmask = [False] * data.shape[0]
stats = {}
for column in cont_cols:
    get_outlier_stats(data, column, rowmask, stats)
print("Total no. of outlier rows: {}".format(np.count_nonzero(rowmask)))
outliers = pd.DataFrame.from_dict(
    stats, orient='index',
    columns=['25 Percentile', '75 Percentile', 'IQR', 'Lower', 'Upper',
             'Outliers', 'Outlier Percentage'])
outliers
# Multivariate outlier detection with Local Outlier Factor on the
# continuous columns. fit_predict returns -1 for outliers, +1 for inliers.
from sklearn.neighbors import LocalOutlierFactor
data1 = data[cont_cols]
model = LocalOutlierFactor()
yhat = model.fit_predict(data1.values)
# Fix: the count is of outlier ROWS, not columns.
print("No. of outlier rows: {}".format(np.count_nonzero(yhat == -1)))
mask = yhat == -1
data1[mask]
# Keep only the inlier rows for comparison.
data2 = data[yhat != -1]
print(data2.shape)
data2.head(10)
data2.describe().transpose()
# Re-run the univariate IQR outlier audit on the LOF-filtered frame.
rowmask = [False] * data2.shape[0]
stats = {}
for column in cont_cols:
    get_outlier_stats(data2, column, rowmask, stats)
print("Total no. of outlier rows: {}".format(np.count_nonzero(rowmask)))
outliers = pd.DataFrame.from_dict(
    stats, orient='index',
    columns=['25 Percentile', '75 Percentile', 'IQR', 'Lower', 'Upper',
             'Outliers', 'Outlier Percentage'])
outliers
# Scale the continuous features robustly (median / IQR) — less sensitive to
# the heavy outliers seen above than mean/std scaling.
from sklearn.preprocessing import RobustScaler
X = data[cont_cols].values
transformer = RobustScaler()
transformer.fit(X)
X_t = transformer.transform(X)
X_t
scaledData = data.copy()
scaledData[cont_cols] = X_t
scaledData.head(20)
# IQR outlier audit on the robust-scaled frame (scaling does not remove
# outliers, it only changes their magnitude).
rowmask = [False] * scaledData.shape[0]
stats = {}
for column in cont_cols:
    get_outlier_stats(scaledData, column, rowmask, stats)
print("Total no. of outlier rows: {}".format(np.count_nonzero(rowmask)))
outliers = pd.DataFrame.from_dict(
    stats, orient='index',
    columns=['25 Percentile', '75 Percentile', 'IQR', 'Lower', 'Upper',
             'Outliers', 'Outlier Percentage'])
outliers
# Correlation among the numeric features. Fix: numeric_only=True is required
# in pandas >= 2.0, where DataFrame.corr raises on object columns instead of
# silently dropping them.
corr = data.corr(numeric_only=True)
sns.heatmap(corr, annot=True)
# Collect the categorical (object-dtype) columns and plot their frequencies.
# Fix: countplot data is passed by keyword — positional use is deprecated in
# recent seaborn releases.
cat_cols = []
for col in data.columns:
    if data[col].dtype == 'object':
        cat_cols.append(col)
        plt.figure(figsize=(16, 8))
        sns.countplot(x=data[col])
        plt.show()
data.dtypes
# 'day' was stringified earlier; make it a proper categorical.
data['day'] = data['day'].astype('category')
for col in cat_cols:
    print("--- {} ----- ".format(col))
    print(data[col].value_counts())
# Client-profile features vs. the subscription target.
client_data_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan']
target = ['Target']
# Class balance of the target.
print(data['Target'].value_counts(normalize=True))
for col in client_data_cols:
    print("--------{}----------".format(col))
    print(pd.crosstab(data[col], data['Target'], normalize='index'))
for col in client_data_cols:
    plt.figure(figsize=(16, 8))
    # Fix: keyword arguments — positional use is deprecated in recent seaborn.
    sns.countplot(x=data[col], hue=data['Target'])
    plt.show()
pd.crosstab(data['default'], data['Target'], normalize='index')
pd.crosstab(data['job'], data['Target'], normalize='index')
# Campaign/contact-history features vs. the target.
# Fix: previous_data_cols was defined twice with the same value; once is enough.
previous_data_cols = ['contact', 'month', 'day', 'poutcome']
for col in previous_data_cols:
    plt.figure(figsize=(16, 8))
    # Keyword arguments — positional use is deprecated in recent seaborn.
    sns.countplot(x=data[col], hue=data['Target'])
    plt.show()
for col in previous_data_cols:
    print("------------{}-------------".format(col))
    print(pd.crosstab(data[col], data['Target'], normalize='index'))
# Pairwise row-normalized cross-tabulations: association between selected
# categorical feature pairs, then between two features and the target.
# (Bare expressions: each was a notebook cell whose last value displayed.)
cat_cols
pd.crosstab(data['job'], data['marital'], normalize='index')
pd.crosstab(data['default'], data['housing'], normalize='index')
pd.crosstab(data['default'], data['loan'], normalize='index')
pd.crosstab(data['marital'], data['education'], normalize='index')
pd.crosstab(data['loan'], data['housing'], normalize='index')
pd.crosstab(data['default'], data['Target'], normalize='index')
pd.crosstab(data['housing'], data['Target'], normalize='index')
# Conversion rate per category for every categorical feature; 'marital'
# shows little separation, so it is dropped from the frame and the list.
for col in cat_cols:
    print('-------------{}--------------'.format(col))
    cf = pd.crosstab(data[col], data['Target'], normalize='index')
    print(cf)
data.drop('marital', axis=1, inplace=True)
cat_cols.remove('marital')
cat_cols
# Median of each numeric feature per target class. Fix: numeric_only=True is
# required in pandas >= 2.0, where groupby().median() raises on object columns.
data.groupby('Target').median(numeric_only=True)
for col in ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']:
    plt.figure(figsize=(16, 8))
    sns.catplot(x='Target', y=col, data=data, kind='boxen')
    plt.show()
# Split features/target and label-encode the target to {0, 1}.
X = data.drop('Target', axis=1)
Y = data[['Target']]
X.shape, Y.shape
#X = pd.get_dummies(X)
X.head(10)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Fix: fit/transform on a 1-D array — passing the (n, 1) DataFrame triggers
# a column-vector shape warning in scikit-learn, which expects 1-D y.
Y = le.fit_transform(Y.values.ravel())
Y = Y.reshape((len(Y), 1))
print(Y.shape)
Y
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=7)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape
# Fit the scaler on the TRAINING split only, then apply to both splits
# (avoids test-set leakage).
scaler = RobustScaler()
X_conts = X_train[cont_cols]
scaler.fit(X_conts)
X_conts_scaled = scaler.transform(X_conts)
X_train_scaled = X_train.copy()
X_train_scaled[cont_cols] = X_conts_scaled
X_conts = X_test[cont_cols]
X_conts_scaled = scaler.transform(X_conts)
X_test_scaled = X_test.copy()
X_test_scaled[cont_cols] = X_conts_scaled
X_train_scaled.head(10)
X_test_scaled.head(10)
X_train = pd.get_dummies(X_train_scaled, drop_first=True)
X_test = pd.get_dummies(X_test_scaled, drop_first=True)
# Fix: dummy columns can differ between splits when a rare category is
# missing from one of them — align the test frame to the training columns.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.shape, X_test.shape)
scores = dict()
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix, f1_score, roc_auc_score

def acc_and_cm(actual, predict):
    """Print accuracy and draw the confusion matrix as an annotated heatmap.

    Parameters
    ----------
    actual, predict : array-like
        True and predicted binary labels (0 = 'No', 1 = 'Yes').
    """
    print('Accuracy: {}'.format(accuracy_score(actual, predict)))
    cm = confusion_matrix(actual, predict)
    # Fix: 'd' formats the annotations as integer counts; the original
    # '.2f' rendered whole-number counts as floats.
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
# Baseline model: logistic regression.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=7)
# ravel(): scikit-learn expects 1-D y and warns on the (n, 1) array.
model.fit(X_train, Y_train.ravel())
Y_pred = model.predict(X_test)
acc_and_cm(Y_test, Y_pred)
scores['Logistic Regression'] = [accuracy_score(Y_test, Y_pred), precision_score(Y_test, Y_pred),
                                 recall_score(Y_test, Y_pred), f1_score(Y_test, Y_pred),
                                 roc_auc_score(Y_test, Y_pred)]
# Shallow decision tree for a second interpretable baseline.
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=5, random_state=7)
# ravel(): scikit-learn expects 1-D y and warns on the (n, 1) array.
model.fit(X_train, Y_train.ravel())
Y_pred = model.predict(X_test)
acc_and_cm(Y_test, Y_pred)
scores['Decision Tree'] = [accuracy_score(Y_test, Y_pred), precision_score(Y_test, Y_pred),
                           recall_score(Y_test, Y_pred), f1_score(Y_test, Y_pred),
                           roc_auc_score(Y_test, Y_pred)]
# Compare four ensemble classifiers with default hyperparameters.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
models = (
    ('Random Forest', RandomForestClassifier(random_state=7)),
    ('Bagging', BaggingClassifier(random_state=7)),
    ('AdaBoost', AdaBoostClassifier(random_state=7)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=7))
)
for name, model in models:
    # ravel(): scikit-learn expects 1-D y and warns on the (n, 1) array.
    model.fit(X_train, Y_train.ravel())
    Y_pred = model.predict(X_test)
    acc_and_cm(Y_test, Y_pred)
    scores[name] = [accuracy_score(Y_test, Y_pred), precision_score(Y_test, Y_pred),
                    recall_score(Y_test, Y_pred), f1_score(Y_test, Y_pred),
                    roc_auc_score(Y_test, Y_pred)]
results = pd.DataFrame.from_dict(scores, orient='index',
                                 columns=['Accuracy', 'Precision', 'Recall', 'F1 score', 'ROC AUC score'])
results
# Tune max_features for gradient boosting, optimizing recall
# (cv=None means the default 5-fold cross-validation).
from sklearn.model_selection import GridSearchCV
grid = {'max_features': [5, 10, 15, 20, 25, 30, 40, 45]}
model = GradientBoostingClassifier(random_state=7)
search = GridSearchCV(estimator=model, param_grid=grid, scoring='recall', cv=None)
# ravel(): scikit-learn expects 1-D y and warns on the (n, 1) array.
results = search.fit(X_train, Y_train.ravel())
results
results.cv_results_['mean_test_score']
# Tune n_estimators for bagging, then refit the chosen setting and report.
model = BaggingClassifier(random_state=7)
grid = {'n_estimators': [12, 15, 20, 25, 30]}
search = GridSearchCV(estimator=model, param_grid=grid, scoring='recall', cv=None)
# ravel(): scikit-learn expects 1-D y and warns on the (n, 1) array.
results = search.fit(X_train, Y_train.ravel())
results
results.cv_results_['mean_test_score']
model = BaggingClassifier(n_estimators=15, random_state=7)
model.fit(X_train, Y_train.ravel())
Y_pred = model.predict(X_test)
acc_and_cm(Y_test, Y_pred)
print(classification_report(Y_test, Y_pred))
# Feature importances from a random forest; keep features above a 1% share.
model = RandomForestClassifier(random_state=7)
# ravel(): scikit-learn expects 1-D y and warns on the (n, 1) array.
model.fit(X_train, Y_train.ravel())
importances = model.feature_importances_
features = X_train.columns
df = pd.DataFrame(importances, index=features, columns=['importance'])
df.sort_values('importance', ascending=False).head(20)
selected_features = df[df['importance'] > 0.01].index
selected_features
# Rebuild the dataset from scratch (unscaled) to test the tree-based models
# on raw features.
newdata = pd.read_csv('bank-full.csv')
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
newdata[cat_cols] = newdata[cat_cols].astype('category')
print(newdata.dtypes)
X = newdata.drop('Target', axis=1)
X = pd.get_dummies(X, drop_first=True)
Y = newdata[['Target']]
le = LabelEncoder()
# Fix: encode from a 1-D array — fitting on the (n, 1) frame triggers a
# column-vector shape warning in scikit-learn.
Y = le.fit_transform(Y.values.ravel())
Y = Y.reshape((len(Y), 1))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=7)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape
X_train.head(5)
X_test.head(5)
# Re-run the four ensembles on the raw (unscaled) feature set.
scores = dict()
models = (
    ('Random Forest', RandomForestClassifier(random_state=7)),
    ('Bagging', BaggingClassifier(random_state=7)),
    ('AdaBoost', AdaBoostClassifier(random_state=7)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=7))
)
for name, model in models:
    # ravel(): scikit-learn expects 1-D y and warns on the (n, 1) array.
    model.fit(X_train, Y_train.ravel())
    Y_pred = model.predict(X_test)
    acc_and_cm(Y_test, Y_pred)
    scores[name] = [accuracy_score(Y_test, Y_pred), precision_score(Y_test, Y_pred),
                    recall_score(Y_test, Y_pred), f1_score(Y_test, Y_pred),
                    roc_auc_score(Y_test, Y_pred)]
results = pd.DataFrame.from_dict(scores, orient='index',
                                 columns=['Accuracy', 'Precision', 'Recall', 'F1 score', 'ROC AUC score'])
results
# Restore the scaled/dummied train-test frames, run univariate feature
# scoring, then reduce to the forest-selected features.
X_train = pd.get_dummies(X_train_scaled, drop_first=True)
X_test = pd.get_dummies(X_test_scaled, drop_first=True)
print(X_train.shape, X_test.shape)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# NOTE(review): k='all' keeps every feature, so the transform is a no-op —
# presumably the intent was to inspect fs.scores_ or choose a smaller k.
fs = SelectKBest(score_func=f_classif, k='all')
# ravel(): scikit-learn expects 1-D y and warns on the (n, 1) array.
fs.fit(X_train, Y_train.ravel())
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)
model = BaggingClassifier(n_estimators=15, random_state=7)
model.fit(X_train_fs, Y_train.ravel())
Y_pred = model.predict(X_test_fs)
print("Recall: {}".format(recall_score(Y_test, Y_pred)))
acc_and_cm(Y_test, Y_pred)
# Keep only the features the random forest ranked above 1% importance.
X_train = X_train[selected_features]
X_test = X_test[selected_features]
# Final comparison: the four ensembles on the reduced feature set.
scores = dict()
models = (
    ('Random Forest', RandomForestClassifier(random_state=7)),
    ('Bagging', BaggingClassifier(random_state=7)),
    ('AdaBoost', AdaBoostClassifier(random_state=7)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=7))
)
for name, model in models:
    # ravel(): scikit-learn expects 1-D y and warns on the (n, 1) array.
    model.fit(X_train, Y_train.ravel())
    Y_pred = model.predict(X_test)
    acc_and_cm(Y_test, Y_pred)
    scores[name] = [accuracy_score(Y_test, Y_pred), precision_score(Y_test, Y_pred),
                    recall_score(Y_test, Y_pred), f1_score(Y_test, Y_pred),
                    roc_auc_score(Y_test, Y_pred)]
results = pd.DataFrame.from_dict(scores, orient='index',
                                 columns=['Accuracy', 'Precision', 'Recall', 'F1 score', 'ROC AUC score'])
results